{
 "cells": [
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2025-08-01T03:11:20.065347Z",
     "start_time": "2025-08-01T03:11:18.121871Z"
    }
   },
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ],
   "outputs": [],
   "execution_count": 2
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 数据导入导出\n",
    "# csv = pd.read_csv('data/employees.csv')\n",
    "# print(csv)\n",
    "# \n",
    "# csv = csv.tail()\n",
    "# new = csv.to_csv('data/new.csv')\n",
    "# print(new)\n",
    "\n",
    "# json\n",
    "# json = pd.read_json('data/data1.json')\n",
    "# print(json)\n",
    "\n",
    "# 缺失值\n",
    "s = pd.Series([1, 2, np.nan, None, pd.NA])\n",
    "# print(s)\n",
    "print(s.isna())\n",
    "print(s.isnull())"
   ],
   "id": "edb8cc297fa00c1d",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-08-01T03:11:21.752386Z",
     "start_time": "2025-08-01T03:11:21.717480Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 企鹅数据分析\n",
    "penguins = pd.read_csv('data/penguins.csv')\n",
    "penguins.head(5)"
   ],
   "id": "bcf8c7c339603aa6",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \\\n",
       "0  Adelie  Torgersen            39.1           18.7              181.0   \n",
       "1  Adelie  Torgersen            39.5           17.4              186.0   \n",
       "2  Adelie  Torgersen            40.3           18.0              195.0   \n",
       "3  Adelie  Torgersen             NaN            NaN                NaN   \n",
       "4  Adelie  Torgersen            36.7           19.3              193.0   \n",
       "\n",
       "   body_mass_g     sex  \n",
       "0       3750.0    Male  \n",
       "1       3800.0  Female  \n",
       "2       3250.0  Female  \n",
       "3          NaN     NaN  \n",
       "4       3450.0  Female  "
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>species</th>\n",
       "      <th>island</th>\n",
       "      <th>bill_length_mm</th>\n",
       "      <th>bill_depth_mm</th>\n",
       "      <th>flipper_length_mm</th>\n",
       "      <th>body_mass_g</th>\n",
       "      <th>sex</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>39.1</td>\n",
       "      <td>18.7</td>\n",
       "      <td>181.0</td>\n",
       "      <td>3750.0</td>\n",
       "      <td>Male</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>39.5</td>\n",
       "      <td>17.4</td>\n",
       "      <td>186.0</td>\n",
       "      <td>3800.0</td>\n",
       "      <td>Female</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>40.3</td>\n",
       "      <td>18.0</td>\n",
       "      <td>195.0</td>\n",
       "      <td>3250.0</td>\n",
       "      <td>Female</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>36.7</td>\n",
       "      <td>19.3</td>\n",
       "      <td>193.0</td>\n",
       "      <td>3450.0</td>\n",
       "      <td>Female</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 3
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-08-01T03:27:49.155749Z",
     "start_time": "2025-08-01T03:27:49.132167Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 1、数据清洗\n",
    "penguins.isna().sum()\n",
    "penguins.dropna(inplace=True)\n",
    "penguins.isna().sum()\n",
    "# 2、数据特征构造\n",
    "# 3、数据分析\n",
    "labels = ['低', '中', '高']\n",
    "penguins['mass_g_label'] = pd.cut(penguins['body_mass_g'], bins=3, labels=labels)\n",
    "print(penguins['mass_g_label'].value_counts())\n",
    "\n",
    "penguins.groupby(['sex', 'island']).agg({'body_mass_g': ['mean', 'count']})"
   ],
   "id": "741e73d34a77d5ff",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mass_g_label\n",
      "低    150\n",
      "中    128\n",
      "高     55\n",
      "Name: count, dtype: int64\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "                  body_mass_g      \n",
       "                         mean count\n",
       "sex    island                      \n",
       "Female Biscoe     4319.375000    80\n",
       "       Dream      3446.311475    61\n",
       "       Torgersen  3395.833333    24\n",
       "Male   Biscoe     5104.518072    83\n",
       "       Dream      3987.096774    62\n",
       "       Torgersen  4034.782609    23"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th colspan=\"2\" halign=\"left\">body_mass_g</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>mean</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sex</th>\n",
       "      <th>island</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">Female</th>\n",
       "      <th>Biscoe</th>\n",
       "      <td>4319.375000</td>\n",
       "      <td>80</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Dream</th>\n",
       "      <td>3446.311475</td>\n",
       "      <td>61</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Torgersen</th>\n",
       "      <td>3395.833333</td>\n",
       "      <td>24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">Male</th>\n",
       "      <th>Biscoe</th>\n",
       "      <td>5104.518072</td>\n",
       "      <td>83</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Dream</th>\n",
       "      <td>3987.096774</td>\n",
       "      <td>62</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Torgersen</th>\n",
       "      <td>4034.782609</td>\n",
       "      <td>23</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 13
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
